In this document, we assess our predictions of a simple definition of direction. First we get the data.

# Load the evaluations
save_data <- readRDS(file.path(analysis_dir, paste(analysis_type,"prediction.rds",sep = "-")))
evals <- save_data$evals
evals <- evals %>% rename(target_date = target_end_date)
forecasting_task <- save_data$forecasting_task
forecaster_args <- save_data$forecaster_args

# Load the trained models
forecasters <- unique(evals$forecaster)
trained_models <- lapply(forecasters, FUN = function(forecaster){
  readRDS_from_dir(file.path(model_dir,forecaster))
})
names(trained_models) <- forecasters

# Create coefficient dfs
# coefs <- lapply(trained_models,create_coef_df) %>% 
#   bind_rows(.id = "forecaster") %>%
#   mutate(feature_name = parse_feature_name(feature_name)) 

# Omit whatever was asked to be omitted.
evals <- evals %>% filter(!(forecaster %in% omitted_forecasters) & 
                          !(ahead %in% omitted_aheads) &
                          !(forecast_date %in% omitted_forecast_dates))
# coefs <- coefs %>% filter(!(forecaster %in% omitted_forecasters) & 
#                       !(ahead %in% omitted_aheads) &
#                       !(forecast_date %in% omitted_forecast_dates))

Here’s some information about our forecasting problem: specifically, the forecasting task, the definition of response, the features, and other information on training.

Forecasting task

Our forecasting task is:

Response variable

We transform the numeric response into a factor for classification. We will call this variable direction. The parameters used to define direction are

Let \(N_i^{t}(w)\) denote \(w\)-day trailing average of the response variable, observed at a given {location = i,day = t}. A given {location = i,forecast_date = t,ahead = a} triplet qualifies as an

\[ \frac{{N}_i^{t + a}(w) - {N}_i^{t}(w)}{{N}_{i}^{t}(w)} \geq u_0~~\textrm{and}~~ {N}_i^{t}(w) \geq t_0 \] where \(u_0\) is the upswing threshold, and \(t_0\) is the minimum threshold.

\[ \frac{{N}_i^{t + a}(w) - {N}_i^{t}(w)}{{N}_{i}^{t}(w)} \leq d_0~~\textrm{and}~~ {N}_i^{t}(w) \geq t_0 \] where \(d_0\) is the downswing threshold.

Features

As features, we use ratios of indicators, evaluated at different lags. The parameters used to define features are:

To be precise, let \(X_i^{t}(w)\) denote \(w\)-day trailing average of an indicator variable, observed at a given {location = i,day = t}. Then on forecast date \(d\), we use as features variables of the form \[ \frac{X_i^{d - \ell}(w)}{X_i^{d - \ell - w}(w)} \]

Training information

There are a few other relevant parameters used to determine our training set.

Data wrangling

We restrict outselves to evaluation on only the subset of tasks that all forecasters have completed.

## Handle missing predictions.

# Filter to common forecast dates
common_forecast_dates <- find_common_forecast_dates(evals)
evals <- evals %>% filter(forecast_date %in% common_forecast_dates)
# coefs <- coefs %>% filter(forecast_date %in% common_forecast_dates)

# Filter to tasks all forecasters have completed.
na_tasks <- evals %>% 
  filter(is.na(value) | is.na(actual)) %>%
  select(geo_value, forecast_date, ahead) %>%
  distinct()

evals <- evals %>%
  anti_join(na_tasks)

Now we aggregate our evaluations (using power at different levels of type I error, and area under the ROC curve), and our estimated coefficients (using median). Aggregations are over all time and per forecast yearmonth.

### Aggregations over all forecast dates.
### Each metric is one-class-against-all.

roc_by_alpha <- function(x,alpha = seq(.01,1,by = .02)){
  power <- roc(x$value,x$actual,alpha)
}
direction = unique(evals$direction)
all_day_roc <- vector(mode = "list",length = length(direction)); names(all_day_roc) <- direction
all_day_auc <- vector(mode = "list",length = length(direction)); names(all_day_auc) <- direction
for(d in direction)
{
  # ROC
  all_day_roc[[d]] <- evals %>%
    filter(direction == d) %>%
    mutate(actual = if_else(actual == d,1,0)) %>%
    select(forecaster,ahead,forecast_date,value,actual) %>%
    group_by(forecaster,ahead) %>%
    group_modify( ~ roc_by_alpha(.x))

  # AUC
  all_day_auc[[d]] <- evals %>%
    filter(direction == d) %>%
    mutate(actual = if_else(actual == d,1,0)) %>%
    select(forecaster,ahead,forecast_date,value,actual) %>%
    group_by(forecaster,ahead) %>%
    summarize(auc = auc(value,actual,alpha = seq(0,1,by = .01)),.groups = "drop") 
}

# Coefficients
# all_day_coefs <- coefs %>%
#   group_by(ahead, feature_name, forecaster) %>% 
#   summarise(value = median(value), .groups = "drop")

### Aggregation per forecast-year month.

per_yearmonth_roc <- vector(mode = "list",length = length(direction)); names(per_yearmonth_roc) <- direction
per_yearmonth_auc <- vector(mode = "list",length = length(direction)); names(per_yearmonth_auc) <- direction

for(d in direction)
{
  # ROC per year and month
  per_yearmonth_roc[[d]] <- evals %>%
    filter(direction == d) %>%
    mutate(actual = if_else(actual == d,1,0)) %>%
    select(forecaster,ahead,forecast_date,value,actual) %>%
    mutate(forecast_month = lubridate::month(forecast_date),
           forecast_year = lubridate::year(forecast_date)) %>%
    group_by(forecaster,ahead,forecast_month,forecast_year) %>%
    group_modify( ~ roc_by_alpha(.x))
  
  # AUC per year and month
  per_yearmonth_auc[[d]] <- evals %>%
    filter(direction == d) %>%
    mutate(actual = if_else(actual == d,1,0)) %>%
    select(forecaster,ahead,forecast_date,value,actual) %>%
    mutate(forecast_month = lubridate::month(forecast_date),
           forecast_year = lubridate::year(forecast_date)) %>%
    group_by(forecaster,ahead,forecast_month,forecast_year) %>%
    summarize(auc = auc(value,actual,alpha = seq(0,1,by = .01)),.groups = "drop")
}

# # Coefficients
# per_yearmonth_coefs <- coefs %>%
#   mutate(forecast_month = lubridate::month(forecast_date),
#          forecast_year = lubridate::year(forecast_date)) %>%
#   group_by(ahead, feature_name, forecaster,forecast_month,forecast_date) %>% 
#   summarise(value = median(value), .groups = "drop")

EDA

The proportion of places that qualify as an increase in direction, over time.

# Plot the proportion of increases over time.

df <- left_join(
  expand_grid(target_date = unique(evals$target_date),
                  actual = "up"),
  
  evals %>% 
    select(geo_value,target_date,actual) %>%
    group_by(target_date, actual) %>%
    summarize(count = n()) %>%
    summarize(prop = count / sum(count), actual = actual) %>%
    filter(actual == "up")
)

ggplot(df %>% replace_na(list(prop = 0)), aes(x = target_date, y = prop)) +
  geom_line() +
  geom_point() + 
  scale_x_date(date_minor_breaks = "1 month") +
  labs(subtitle = subtitle)

Evaluation

First up, ROC curves by ahead, aggregated over all days.

ROC curves

Direction = up

14 days ahead

ggplot(all_day_roc[["up"]] %>% filter(ahead == 14), 
       aes(x = alpha, y = result, color = forecaster)) +
  geom_point() + 
  geom_line() +
  labs(subtitle = subtitle,
       x = "False positive rate",
       y = "True positive rate")

21 days ahead

ggplot(all_day_roc[["up"]] %>% filter(ahead == 21), 
       aes(x = alpha, y = result, color = forecaster)) +
  geom_point() + 
  geom_line() +
  labs(subtitle = subtitle,
       x = "False positive rate",
       y = "True positive rate")

28 days ahead

ggplot(all_day_roc[["up"]] %>% filter(ahead == 28), 
       aes(x = alpha, y = result, color = forecaster)) +
  geom_point() + 
  geom_line() +
  labs(subtitle = subtitle,
       x = "False positive rate",
       y = "True positive rate")

Direction = steady

14 days ahead

ggplot(all_day_roc[["steady"]] %>% filter(ahead == 14), 
       aes(x = alpha, y = result, color = forecaster)) +
  geom_point() + 
  geom_line() +
  labs(subtitle = subtitle,
       x = "False positive rate",
       y = "True positive rate")

21 days ahead

ggplot(all_day_roc[["steady"]] %>% filter(ahead == 21), 
       aes(x = alpha, y = result, color = forecaster)) +
  geom_point() + 
  geom_line() +
  labs(subtitle = subtitle,
       x = "False positive rate",
       y = "True positive rate")

28 days ahead

ggplot(all_day_roc[["steady"]] %>% filter(ahead == 28), 
       aes(x = alpha, y = result, color = forecaster)) +
  geom_point() + 
  geom_line() +
  labs(subtitle = subtitle,
       x = "False positive rate",
       y = "True positive rate")

Direction = down

14 days ahead

ggplot(all_day_roc[["down"]] %>% filter(ahead == 14), 
       aes(x = alpha, y = result, color = forecaster)) +
  geom_point() + 
  geom_line() +
  labs(subtitle = subtitle,
       x = "False positive rate",
       y = "True positive rate")

21 days ahead

ggplot(all_day_roc[["down"]] %>% filter(ahead == 21), 
       aes(x = alpha, y = result, color = forecaster)) +
  geom_point() + 
  geom_line() +
  labs(subtitle = subtitle,
       x = "False positive rate",
       y = "True positive rate")

28 days ahead

ggplot(all_day_roc[["down"]] %>% filter(ahead == 28), 
       aes(x = alpha, y = result, color = forecaster)) +
  geom_point() + 
  geom_line() +
  labs(subtitle = subtitle,
       x = "False positive rate",
       y = "True positive rate")

Next up, AUC by ahead, aggregated over all dates. The tabs differ in the notion of aggregation:

  • In the first tab, we aggregate by taking AUC over all forecast dates.
  • In the second tab, we group by forecast year and month, then take AUC over all forecast dates.

AUC curves by ahead

All time

Direction = up

ggplot(all_day_auc[["up"]], aes(x = ahead, y = auc, color = forecaster)) +
  geom_point() +
  geom_line() +
  labs(subtitle = subtitle)

Direction = steady

ggplot(all_day_auc[["steady"]], aes(x = ahead, y = auc, color = forecaster)) +
  geom_point() +
  geom_line() +
  labs(subtitle = subtitle)

Direction = down

ggplot(all_day_auc[["down"]], aes(x = ahead, y = auc, color = forecaster)) +
  geom_point() +
  geom_line() +
  labs(subtitle = subtitle)

Per Month

Direction = up

ggplot(per_yearmonth_auc[["up"]], aes(x = ahead, y = auc, color = forecaster)) +
  geom_point() +
  geom_line() +
  labs(subtitle = subtitle) +
  facet_wrap(vars(forecast_year,forecast_month), scales = "free")

Direction = steady

ggplot(per_yearmonth_auc[["steady"]], aes(x = ahead, y = auc, color = forecaster)) +
  geom_point() +
  geom_line() +
  labs(subtitle = subtitle) +
  facet_wrap(vars(forecast_year,forecast_month), scales = "free")

Direction = down

ggplot(per_yearmonth_auc[["down"]], aes(x = ahead, y = auc, color = forecaster)) +
  geom_point() +
  geom_line() +
  labs(subtitle = subtitle) +
  facet_wrap(vars(forecast_year,forecast_month), scales = "free")

Finally, difference in fitted values between forecasters, depending on the truth.

Fitted values

All aheads

Predictions for class = “up”

ggplot(diff_evals %>% filter(direction == "up"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

Predictions for class = “steady”

ggplot(diff_evals %>% filter(direction == "steady"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

Predictions for class = “down”

ggplot(diff_evals %>% filter(direction == "down"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

By ahead

14 days ahead

Predictions for class = “up”
ggplot(diff_evals %>% filter(ahead == 14, direction == "up"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

Predictions for class = “steady”
ggplot(diff_evals %>% filter(ahead == 14, direction == "steady"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

Predictions for class = “down”
ggplot(diff_evals %>% filter(ahead == 14, direction == "down"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

21 days ahead

Predictions for class = “up”
ggplot(diff_evals %>% filter(ahead == 21, direction == "up"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

Predictions for class = “steady”
ggplot(diff_evals %>% filter(ahead == 21, direction == "steady"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

Predictions for class = “down”
ggplot(diff_evals %>% filter(ahead == 21, direction == "down"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

28 days ahead

Predictions for class = “up”
ggplot(diff_evals %>% filter(ahead == 28, direction == "up"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

Predictions for class = “steady”
ggplot(diff_evals %>% filter(ahead == 28, direction == "steady"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

Predictions for class = “down”
ggplot(diff_evals %>% filter(ahead == 28, direction == "down"), aes(x = diff, fill = forecaster)) +
  geom_histogram(bins = 200) +
  labs(subtitle = subtitle) +
  facet_grid(rows = vars(actual),
             cols = vars(forecaster),
             scales = "fixed") +
  geom_vline(xintercept = 0)

Next up, calibration across time. That is, how does the rate at which we call up compare to the rate of true ups. To do this, we need to transform our fitted values into a class prediction. We do this by thresholding the fitted values. Each forecaster is allowed a separate threshold, which is chosen so that forecaster would have a false positive rate of 25% across time.

# Compute proportion of actual 
threshold <- evals %>%
  filter(actual != "up") %>%
  group_by(ahead,forecaster) %>%
  summarize(t = quantile(value,.75))

prop_pred_up_ar <- evals %>% 
  filter(forecaster == "AR3") %>%
  left_join(threshold) %>%
  filter(direction == "up") %>%
  group_by(ahead,target_date,forecaster) %>%
  summarize(prop_ar = mean(value > t,na.rm = T)) %>%
  select(-forecaster)

prop_pred_up_rest <- evals %>% 
  filter(forecaster != "AR3") %>%
  left_join(threshold) %>%
  filter(direction == "up") %>%
  group_by(ahead,target_date,forecaster) %>%
  summarize(prop = mean(value > t,na.rm = T)) %>%
  rename(var = forecaster)

prop_pred_up <- left_join(prop_pred_up_rest,prop_pred_up_ar)

prop_actual_up <- evals %>%
  group_by(ahead,target_date) %>%
  summarize(prop_actual = mean(ifelse(actual == "up",1,0),na.rm = T))

prop_up <- left_join(prop_pred_up,prop_actual_up)

# Plotting info
groups <- unique(prop_up$var)
group_colors <- c(hue_pal()(length(groups) + 1)[-1])
names(group_colors) <- c(groups)

Calibration across time

14 days ahead

ggplot(prop_up %>% filter(ahead == 14), aes(x = target_date)) +
  geom_line(aes(y = prop,color = var)) +
  geom_point(aes(y = prop,color = var)) + 
  geom_line(aes(y = prop_actual), color = "black") +
  geom_point(aes(y = prop_actual), color = "black") + 
  geom_line(aes(y = prop_ar), color = hue_pal()(1)) +
  geom_point(aes(y = prop_ar), color = hue_pal()(1)) +
  scale_x_date(date_minor_breaks = "1 month") +
  labs(subtitle = subtitle) +
  scale_colour_manual( values= group_colors ) +
  facet_wrap(vars(var),ncol = 1)

21 days ahead

ggplot(prop_up %>% filter(ahead == 21), aes(x = target_date)) +
  geom_line(aes(y = prop,color = var)) +
  geom_point(aes(y = prop,color = var)) + 
  geom_line(aes(y = prop_actual), color = "black") +
  geom_point(aes(y = prop_actual), color = "black") + 
  geom_line(aes(y = prop_ar), color = hue_pal()(1)) +
  geom_point(aes(y = prop_ar), color = hue_pal()(1)) +
  scale_x_date(date_minor_breaks = "1 month") +
  labs(subtitle = subtitle) +
  scale_colour_manual( values= group_colors ) +
  facet_wrap(vars(var),ncol = 1)

28 days ahead

ggplot(prop_up %>% filter(ahead == 28), aes(x = target_date)) +
  geom_line(aes(y = prop,color = var)) +
  geom_point(aes(y = prop,color = var)) + 
  geom_line(aes(y = prop_actual), color = "black") +
  geom_point(aes(y = prop_actual), color = "black") + 
  geom_line(aes(y = prop_ar), color = hue_pal()(1)) +
  geom_point(aes(y = prop_ar), color = hue_pal()(1)) +
  scale_x_date(date_minor_breaks = "1 month") +
  labs(subtitle = subtitle) +
  scale_colour_manual( values= group_colors ) +
  facet_wrap(vars(var),ncol = 1)

Next up, trajectories. We plot the fitted values for class = up, at ahead = 14,21,28, for each Monday. We also plot the true number of hospitalizations, normalized to lie in [0,1].

Warning: the fitted values and true number of hospitalizations are not on the same scale. These kinds of graphs can be visually misleading.

response_data <- save_data$response_data %>%
  select(geo_value,time_value,value) %>%
  rename(target_date = time_value,actual = value) %>%
  group_by(geo_value) %>%
  mutate(actual = actual/max(actual)) # Transform to be in [0,1].

trajectory_df <- left_join(evals %>% 
                     filter(lubridate::wday(forecast_date) == 2 & 
                              ahead %in% c(14,21,28) & direction == "up") %>%
                     select(geo_value,target_date,value,forecaster,forecast_date),
                   response_data)

Trajectory Plots

## Trajectory plots
ggplot(trajectory_df, aes(x = target_date)) +
  geom_line(aes(y = value,color = forecaster, 
                group = interaction(forecast_date,forecaster))) +
  geom_point(aes(y = value, color = forecaster)) +
  geom_line(aes(y = actual),col = "black") + 
  facet_wrap(vars(geo_value), ncol = 1)

knitr::knit_exit()